import pandas as pd
import seaborn as sns
import plotly.express as px
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
# Read train.csv and test.csv from the current working directory.
data = pd.read_csv("train.csv")
test_data = pd.read_csv('test.csv')
data.head()
data.columns

# Missing values per column, before and after filling 'Area Income'
# with that column's median.
data.isna().sum()
data['Area Income'] = data['Area Income'].fillna(data['Area Income'].median())
data.isna().sum()

# Complement of 'Clicked': 1 where Clicked == 0, otherwise 0.
data['no_click'] = data['Clicked'].eq(0).astype('int64')
data.head()
# Per-gender totals of the numeric columns; the chart uses 'Clicked' and 'no_click'.
# numeric_only=True is spelled out because pandas >= 2.0 no longer drops
# non-numeric columns from a groupby sum on its own: string columns would be
# concatenated and datetime columns raise a TypeError.
gender = data.groupby("gender").sum(numeric_only=True).reset_index()
fig = px.bar(
    gender,
    x='gender',
    y=['Clicked', 'no_click'],
    title="Click Rate by Gender",
)
fig.show()

# Distribution of daily time on site, clicked vs. not clicked drawn side by side.
# Despite the name, `ax` holds whatever sns.displot returns (a figure-level grid).
ax = sns.displot(
    data,
    x='Daily Time Spent on Site',
    hue='Clicked',
    multiple='dodge',
)
ax.set(title="Clicks by Time on Site")
# Keep the first 10 characters of 'Timestamp' (presumably the calendar date in
# ISO form -- confirm against the CSV). The vectorised .str accessor replaces the
# per-row lambda and leaves missing values as NaN instead of raising.
data['date'] = data['Timestamp'].str[:10]
data.head()

# pd.to_datetime replaces astype(np.datetime64): casting to a unit-less
# datetime64 dtype is rejected by pandas >= 2.0.
data['date'] = pd.to_datetime(data['date'])

# Weekday name ('Monday', 'Tuesday', ...) of each date.
data['day'] = data['date'].dt.day_name()
data.head()
# Per-weekday totals of the numeric columns.
# numeric_only=True is required on pandas >= 2.0: `data` holds a datetime64
# 'date' column at this point, and summing it raises a TypeError.
dayofweek = data.groupby('day').sum(numeric_only=True).reset_index()
dayofweek.head(7)

# Stacked click / no-click counts, with weekdays forced into calendar order
# (Sunday first) instead of the alphabetical order produced by groupby.
fig = px.bar(
    dayofweek,
    x='day',
    y=['Clicked', 'no_click'],
    title='Clicks by Day of Week',
    category_orders={
        'day': ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'],
    },
    labels={'day': 'Day of Week', 'value': 'Count', 'variable': 'Click Rate'},
)
fig.show()
def timemap(x):
    """Bucket a time-on-site value into the lower edge of its 10-wide bin.

    Values below 30 map to 20, values in [30, 40) to 30, and so on up to
    [80, 90) -> 80. Anything else (90 and above, or a value such as NaN
    that fails every comparison) maps to 90.
    """
    # (exclusive upper bound, bucket) pairs, checked in ascending order.
    bins = ((30, 20), (40, 30), (50, 40), (60, 50), (70, 60), (80, 70), (90, 80))
    for upper_bound, bucket in bins:
        if x < upper_bound:
            return bucket
    return 90
# Bucket each row's daily time on site with timemap().
data['site_time'] = data['Daily Time Spent on Site'].apply(timemap)
data.head()

# Per-bucket totals of the numeric columns.
# numeric_only=True is explicit because pandas >= 2.0 no longer drops
# non-numeric columns from a groupby sum (strings are concatenated and
# datetime columns raise a TypeError).
site = data.groupby('site_time').sum(numeric_only=True).reset_index()
site.head()

# Stacked click / no-click counts per time bucket. The call is already inside
# parentheses, so no backslash continuation is needed.
fig = px.bar(
    site,
    x='site_time',
    y=['Clicked', 'no_click'],
    title="Clicks by Time Spent on Site",
    labels={
        'site_time': 'Time Spent (minutes)',
        'value': 'Count Click/No Click',
        'variable': 'Click Rate',
    },
)
fig.show()
# Age distribution, with clicked / not-clicked bars drawn side by side.
ax = sns.displot(
    data,
    x='Age',
    hue='Clicked',
    multiple="dodge",
)
ax.set(xlabel="Age")
def agegroups(x):
    """Return the age-band label for an age value.

    Bands are "20-29", "30-39", "40-49", "50-59" and "60+". Every value
    below 30 (including ages under 20) gets "20-29"; anything that is not
    below 60 (including NaN, which fails every comparison) gets "60+".
    """
    if x < 30:
        return "20-29"
    if x < 40:
        return "30-39"
    if x < 50:
        return "40-49"
    if x < 60:
        return "50-59"
    return "60+"
# Label every row with its age band via agegroups().
data['age_groups'] = data['Age'].apply(agegroups)
data.head()

# Per-band totals of the numeric columns.
# numeric_only=True is explicit because pandas >= 2.0 no longer drops
# non-numeric columns from a groupby sum (strings are concatenated and
# datetime columns raise a TypeError).
ages = data.groupby('age_groups').sum(numeric_only=True).reset_index()
ages.head()

# Stacked click / no-click counts per age band.
fig = px.bar(
    ages,
    x='age_groups',
    y=['Clicked', 'no_click'],
    title="Clicks by Age Groups",
    labels={
        'age_groups': 'Age Groups',
        'value': 'Count Click/No Click',
        'variable': 'Click Rate',
    },
)
fig.show()
Users aged 40 and older generate as many clicks as younger users, but their ratio of clicks to non-clicks is much higher.
def inclevel(x):
    """Return the income-band label for an area-income value.

    Bands are "<30K", "30-39K", "40-49K", "50-59K", "60-69K" and "70K+".
    Lower bounds are inclusive and upper bounds exclusive; anything that is
    not below 70000 (including NaN, which fails every comparison) gets "70K+".
    """
    # (exclusive upper bound, label) pairs, checked in ascending order.
    bands = (
        (30000, "<30K"),
        (40000, "30-39K"),
        (50000, '40-49K'),
        (60000, '50-59K'),
        (70000, '60-69K'),
    )
    for upper_bound, label in bands:
        if x < upper_bound:
            return label
    return '70K+'
# Label every row with its income band via inclevel().
data['income_level'] = data['Area Income'].apply(inclevel)

# Per-band totals of the numeric columns.
# numeric_only=True is explicit because pandas >= 2.0 no longer drops
# non-numeric columns from a groupby sum (strings are concatenated and
# datetime columns raise a TypeError).
income = data.groupby('income_level').sum(numeric_only=True).reset_index()
income.head()

# Stacked click / no-click counts per income band, with the bands forced into
# ascending order rather than the alphabetical order produced by groupby.
fig = px.bar(
    income,
    x='income_level',
    y=['Clicked', 'no_click'],
    title='Income Levels and Clicks',
    labels={
        'income_level': "Income Groups",
        'value': "Count Click/No Click",
        'variable': 'Click Rate',
    },
    category_orders={
        'income_level': ['<30K', '30-39K', '40-49K', '50-59K', '60-69K', '70K+'],
    },
)
fig.show()
# Distribution of daily internet usage, clicked vs. not clicked side by side.
ax = sns.displot(data, x='Daily Internet Usage', hue='Clicked', multiple="dodge")
ax.set(xlabel="Time Spent on the Internet")

# Boolean flag: True when daily internet usage is 175 or less.
data['total_time'] = data['Daily Internet Usage'] <= 175
data['total_time'].value_counts()

# Totals of the numeric columns for each value of the flag.
# numeric_only=True is explicit because pandas >= 2.0 no longer drops
# non-numeric columns from a groupby sum (strings are concatenated and
# datetime columns raise a TypeError).
total_t = data.groupby('total_time').sum(numeric_only=True).reset_index()
total_t.head()

# Stacked click / no-click counts for the two usage groups.
fig = px.bar(
    total_t,
    x='total_time',
    y=['Clicked', 'no_click'],
    title="How total time on the internet affects clicks",
    labels={
        'total_time': "Spends 175 minutes or less per day on the internet",
        'value': 'Count',
        'variable': 'Click Rate',
    },
)
fig.show()
# Per-ad-topic totals of the numeric columns ('Clicked' and 'no_click' are the
# ones used below).
# numeric_only=True is explicit because pandas >= 2.0 no longer drops
# non-numeric columns from a groupby sum (strings are concatenated and
# datetime columns raise a TypeError).
topic = data.groupby('Ad Topic Line').sum(numeric_only=True).reset_index()
topic

# Topics clicked more than once and never left unclicked.
effective1 = topic[(topic['Clicked'] > 1) & (topic['no_click'] == 0)]
highly_effective = effective1['Ad Topic Line'].tolist()
len(highly_effective)

# Topics clicked exactly once and never left unclicked.
effective2 = topic[(topic['Clicked'] == 1) & (topic['no_click'] == 0)]
effective = effective2['Ad Topic Line'].tolist()
len(effective)

# Topics with at most one click and at least one non-click.
effective3 = topic[(topic['Clicked'] <= 1) & (topic['no_click'] > 0)]
ineffective = effective3['Ad Topic Line'].tolist()
len(ineffective)

# NOTE(review): this filter is identical to the one that builds `effective3`,
# so `ineff` always equals `ineffective`. Topics with more than one click AND
# at least one non-click fall into none of the four selections. Confirm
# whether a different condition was intended here.
effective4 = topic[(topic['Clicked'] <= 1) & (topic['no_click'] > 0)]
ineff = effective4['Ad Topic Line'].tolist()
len(ineff)

# Total non-click rows across the topics selected above.
effective4['no_click'].sum()
def effectiveness(x):
    """Rate an ad topic line as "High", "Medium" or "Low".

    Looks the value up in the module-level collections `highly_effective`
    and then `effective`; a value found in neither is rated "Low".
    """
    if x in highly_effective:
        return "High"
    return "Medium" if x in effective else "Low"
# Rate every row's ad topic line with effectiveness().
# NOTE(review): effectiveness() looks topics up in lists that were built from
# the 'Clicked' totals of this same `data` frame, so this column encodes the
# label it is later used to predict -- confirm that this is intended before
# trusting any model score that uses it.
data['ad_effectiveness'] = data['Ad Topic Line'].map(effectiveness)
data['ad_effectiveness'].value_counts()
data.columns
# Modelling frame: drop the identifier, the raw columns that were replaced by
# the bucketed versions above, and the helper columns.
df = data.drop(
    columns=[
        'id', 'Timestamp', 'Daily Time Spent on Site', 'Age', 'Area Income',
        'Daily Internet Usage', 'Ad Topic Line', 'Country', 'no_click', 'date',
    ],
)
df.head()
df.shape
df.columns

# Target vector.
df_y = df['Clicked']
df_y.shape

# Feature matrix: get_dummies one-hot encodes the string-valued columns in
# `feature`; 'gender' is then appended unchanged.
feature = [
    'day',
    'site_time',
    'age_groups',
    'income_level',
    'total_time',
    'ad_effectiveness',
]
df_X = pd.get_dummies(df[feature])
df_X.shape
df_X['gender'] = df['gender']
df_X.shape
from sklearn.model_selection import train_test_split

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df_X,
    df_y,
    test_size=0.20,
    random_state=7,
)
print(X_train.shape)
print(y_train.shape)

from sklearn.ensemble import RandomForestClassifier

# Random forest of 75 trees, each limited to depth 5, with a fixed seed.
rfc = RandomForestClassifier(
    n_estimators=75,
    max_depth=5,
    random_state=7,
)
rfc.fit(X_train, y_train)

# NOTE(review): `prediction` is computed on the training rows, while the score
# below is computed on the held-out rows -- confirm which set was intended.
prediction = rfc.predict(X_train)
rfc.score(X_test, y_test)